############################################################################
#####################     function definitions   ###########################
############################################################################
# Shared colour palette (hex RGB strings). The plotting code below picks
# entries by position, e.g. cccol[1] and cccol[9].
cccol <- c(
  "#CE0013", "#16557A", "#C7A609", "#87C232", "#64C0AB", "#A14C94", "#15A08C",
  "#8B7E75", "#1E7CAF", "#EA425F", "#46489A", "#E50033", "#0F231F", "#1187CD"
)

# Add vertical error bars to the current plot.
#
# x, y    positions of the bar centres.
# upper   distance from y to the top cap of each bar.
# lower   distance from y to the bottom cap (defaults to `upper`).
# length  cap width, forwarded to arrows(length = ).
# ...     further graphical parameters forwarded to arrows().
#
# Stops with an error unless x, y, upper and lower all have the same length.
# Returns whatever arrows() returns.
error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
  # The `length` argument is a number; lengths() is used here so the size
  # check does not read as a call to that argument.
  sizes <- lengths(list(x, y, lower, upper))
  if (any(sizes != sizes[1]))
    stop("vectors must be same length")
  # code = 3 with angle = 90 draws flat caps at both ends of each segment.
  arrows(x0 = x, y0 = y + upper, x1 = x, y1 = y - lower,
         angle = 90, code = 3, lwd = 2, length = length, ...)
}

# Score how specifically one gene is expressed at a given time point.
#
# x            numeric expression vector for a single gene, one value per
#              time point (named when `given_time` is a name).
# given_time   index or name of the time point of interest.
# mean_cutoff  minimum mean expression across all time points.
#
# Returns x[given_time] / mean(x) when the mean reaches `mean_cutoff` and the
# given time point holds the maximum of x; otherwise NA. NA is also returned
# when the test itself cannot be decided (NA values in x, or `given_time` not
# present in x), so one incomplete row no longer aborts an apply() over all
# genes.
SpecificGene <- function(x,given_time,mean_cutoff){
    gene_mean <- mean(x)
    # `&&` is the scalar operator intended for if(); isTRUE() maps an NA
    # outcome to "not specific" instead of raising an error.
    is_specific <- gene_mean >= mean_cutoff && x[given_time] == max(x)
    if (isTRUE(is_specific)) {
        return(x[given_time] / gene_mean)
    }
    NA
}
############################################################################
#####################           read in data     ###########################
############################################################################
# Expression table: the first line is the header and the first column
# supplies the row names.
data <- read.table("../data/nsmb.2660-S2.txt", header = TRUE, row.names = 1)

# Column positions in `data` that belong to each sample group.
Oocyte <- 1:3
Zygote <- 4:6
cell2 <- 7:12
cell4 <- 13:24
cell8 <- 25:44
Morula <- 45:60
MTE <- c(64, 66, 67, 69, 72, 76:79)
PTE <- c(61:63, 65, 68, 70, 71, 81, 82)
PE <- c(84:90)
EPI <- c(73:75, 80, 83)
hESC0 <- 91:98
hESC10 <- 99:124

# Column names used for the averaged matrix, and a second label set without
# the leading "X" on the cell-stage names.
time_point <- c("Oocyte", "Zygote", "X2cell", "X4cell", "X8cell", "Morula",
                "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10")
dev_labels <- c("Oocyte", "Zygote", "2cell", "4cell", "8cell", "Morula",
                "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10")

# Per-row mean over the columns of each group; groups are listed in the same
# order as `time_point`, giving one output column per group.
stage_columns <- list(Oocyte, Zygote, cell2, cell4, cell8, Morula,
                      MTE, PTE, PE, EPI, hESC0, hESC10)
avg <- do.call(
  cbind,
  lapply(stage_columns, function(cols) apply(data[, cols], 1, mean))
)
colnames(avg) <- time_point
development_path <- time_point
dData <- log2(avg + 1)

############ 2nd naive RNAseq
logfpkm2nd <- read.table("../data/2nd.reprogramming.lg2.all.fpkm.txt",
                         header = TRUE, row.names = 1)

# Sample columns to keep, in plotting order (time point, then replicate).
n_path <- c("hiF_r1", "hiF_r2", "he0_r1", "he0_r2", "he2_r1", "he2_r2",
            "he6_r1", "he6_r2", "n8_r1", "n8_r2", "n8_r3", "n12_r1", "n12_r2",
            "n14_r1", "n14_r2", "n14_r3", "n20_r1", "n20_r2", "n20_r3",
            "n24p_r1", "n24p_r2", "n24m_r1", "n24m_r2", "niPS_r1", "niPS_r2")
nData_tmp <- logfpkm2nd[, n_path]
# 2^x - 1 is the inverse of log2(x + 1).
# NOTE(review): presumably the input table stores log2(FPKM + 1) - confirm.
nfpkm2nd <- 2^nData_tmp - 1

n_time_point <- c("hiF", "he0", "he2", "he6", "n8", "n12", "n14", "n20",
                  "n24p", "n24m", "niPS")
n_label <- c("hiF-T", "0d", "2d", "6d", "8d", "12d", "14d", "20d",
             "24d+dox", "24d-dox", "niPSC-T")

# Column positions (within n_path order) of the replicates of each time
# point; one entry per element of n_time_point.
n_replicate_cols <- list(1:2, 3:4, 5:6, 7:8, 9:11, 12:13, 14:16, 17:19,
                         20:21, 22:23, 24:25)
# Average replicates on the un-logged scale, then log-transform the means.
nData2ndfpkm <- do.call(
  cbind,
  lapply(n_replicate_cols, function(cols) apply(nfpkm2nd[, cols], 1, mean))
)
colnames(nData2ndfpkm) <- n_time_point
rownames(nData2ndfpkm) <- rownames(nfpkm2nd)
nData <- log2(nData2ndfpkm + 1)

############ 2nd primed RNAseq
pData2ndfpkm <- read.table("../data/paper.primed.fpkm.txt",
                           header = TRUE, row.names = 1)
pData <- log2(pData2ndfpkm + 1)

############ Normalize
# removeBatchEffect() comes from limma, which is attached as a dependency of
# edgeR.
library(edgeR)
# Only rows present in both data sets are normalised together.
genes <- intersect(row.names(nData), row.names(pData))
paperpath <- c("hiFT", "d2", "d5", "d8", "d14", "d20", "d24p", "d24m", "hiPST")
all_data <- cbind(
  logfpkm2nd[genes, n_path],
  pData[genes, paperpath]
)
# Batch 1 = columns taken from logfpkm2nd, batch 2 = columns taken from pData.
batch <- as.factor(rep(c(1, 2), times = c(length(n_path), length(paperpath))))
# NOTE(review): no design matrix is supplied, so differences between the two
# batches are removed without protecting any condition effect - confirm this
# is intended.
rmbatch_data <- removeBatchEffect(all_data, batch = batch)

# Overwrite nData with replicate means of the batch-corrected values; the
# column positions follow the order of n_path.
n_replicate_cols <- list(1:2, 3:4, 5:6, 7:8, 9:11, 12:13, 14:16, 17:19,
                         20:21, 22:23, 24:25)
nData <- do.call(
  cbind,
  lapply(n_replicate_cols, function(cols) apply(rmbatch_data[, cols], 1, mean))
)
colnames(nData) <- n_time_point
rownames(nData) <- genes
# Overwrite pData with the batch-corrected primed columns.
pData <- rmbatch_data[genes, paperpath]

# Axis labels for the nine time points plotted for both series.
common_time_point <- c("hiF-T", "2d", "6d", "8d", "14d", "20d",
                       "24d+dox", "24d-dox", "iPSC-T")

############################################################################
###########        specific 8c genes/ cluster 8c genes         #############
############################################################################
# Score every row of `avg` for specificity at the "X8cell" column
# (mean cutoff 1); rows that fail the test get NA.
develop_8cell <- apply(avg, 1, SpecificGene,
                       given_time = "X8cell", mean_cutoff = 1)
# sort() drops NA scores; keep the names of the 500 highest scores.
develop_8cell_gene <- names(sort(develop_8cell, decreasing = TRUE)[1:500])
# Restrict to genes present in both reprogramming tables.
develop_8cell_gene <- Reduce(
  intersect,
  list(develop_8cell_gene, rownames(pData2ndfpkm), rownames(nData2ndfpkm))
)

# Second gene set: first column of the cluster file, restricted the same way.
cluster_8cell_genes <- as.vector(
  read.table("../data/kmcluster_36_naive2nd.txt")[, 1]
)
cluster_8cell_genes <- Reduce(
  intersect,
  list(cluster_8cell_genes, rownames(pData2ndfpkm), rownames(nData2ndfpkm))
)

############################################################################
##############                      plot                  ##################
############################################################################
# Gene list taken from the first column of the cluster file.
C2_genes <- as.vector(read.table("../Fig2/Cluster/14cluster_2_gene.txt")[, 1])

# Subsets of the two 8-cell gene sets that also appear in C2_genes.
d_eight_genes <- intersect(C2_genes, develop_8cell_gene)
c_eight_genes <- intersect(C2_genes, cluster_8cell_genes)

# Matching columns of the primed and naive tables; position i in each vector
# is drawn at x = i and labelled common_time_point[i].
primed_cols <- c("hiFT", "d2", "d5", "d8", "d14", "d20", "d24p", "d24m", "hiPST")
naive_cols <- c("hiF", "he2", "he6", "n8", "n14", "n20", "n24p", "n24m", "niPS")

# Per-column mean of log2(fpkm + 1) with a 95% t-based confidence interval
# computed across the rows of `fpkm`.
# Returns a list with elements mean, lower and upper (one value per column).
ci_band <- function(fpkm) {
  log_expr <- log2(fpkm + 1)
  n_genes <- nrow(fpkm)
  centre <- apply(log_expr, 2, mean)
  half_width <- qt(0.975, n_genes - 1) * apply(log_expr, 2, sd) / sqrt(n_genes)
  list(mean = centre, lower = centre - half_width, upper = centre + half_width)
}

# Shade the region between band$lower and band$upper on the current plot.
draw_band <- function(band) {
  idx <- seq_along(band$mean)
  # Walk along the upper edge, then back along the lower edge. rev() keeps
  # the outline valid for any number of points, unlike an `n:2` sequence.
  polygon(c(idx, rev(idx)), c(band$upper, rev(band$lower)),
          col = adjustcolor("grey", alpha.f = 0.4), border = NA)
}

# Draw one page: primed (dashed) and naive (solid) mean curves for `genes`,
# each with its shaded confidence band.
#
# genes       row names selected from pData2ndfpkm and nData2ndfpkm.
# main        plot title.
# zero_floor  TRUE starts the y axis at 0; FALSE starts it at the lowest
#             confidence bound.
#
# Reads pData2ndfpkm, nData2ndfpkm, common_time_point and cccol from the
# enclosing script.
plot_8c_panel <- function(genes, main, zero_floor) {
  par(mar = c(6, 4, 4, 2))
  primed <- ci_band(pData2ndfpkm[genes, primed_cols])
  naive <- ci_band(nData2ndfpkm[genes, naive_cols])

  xmax <- length(common_time_point)
  ymax <- max(primed$upper, naive$upper)
  ymin <- if (zero_floor) 0 else min(primed$lower, naive$lower)

  plot(primed$mean, lwd = 3, type = "l", col = cccol[9],
       ylim = c(ymin, ymax), xlim = c(1, xmax), main = main,
       ylab = "log2(fpkm+1)", xlab = "", xaxt = "n", lty = 2, las = 2)
  axis(1, at = seq(xmax), labels = common_time_point, las = 2)
  draw_band(primed)
  points(naive$mean, lwd = 3, type = "l", col = cccol[1])
  draw_band(naive)
  legend("topleft", c("naive", "primed"), col = c(cccol[1], cccol[9]),
         lty = c(1, 2), lwd = 3, bty = "n")
}

pdf("Fig3B.pdf", width = 5.5, height = 4)
# `finally` closes the PDF device even when a panel fails, so a broken run
# does not leave the device open.
invisible(tryCatch({
  # NOTE(review): pages 1 and 2 share one title, as do pages 3 and 4, although
  # each pair uses different gene sets - confirm whether the titles should
  # differ.
  plot_8c_panel(develop_8cell_gene,
                "Expression dynamic change of all 8c genes", zero_floor = FALSE)
  plot_8c_panel(cluster_8cell_genes,
                "Expression dynamic change of all 8c genes", zero_floor = FALSE)
  plot_8c_panel(d_eight_genes, "8c-gene in reprogramming", zero_floor = TRUE)
  plot_8c_panel(c_eight_genes, "8c-gene in reprogramming", zero_floor = TRUE)
}, finally = dev.off()))
